********************************************
*** Dataset: Load and merge consistency data
********************************************
* Fix the random-number seed, then load the individual-level file,
* reading in only the rows where consent_individual equals 1.
set seed 12345
use if consent_individual == 1 using "$data\4_individual_ano_vars.dta", clear

* Build the lagged ("2021") version of every outcome: each individual's
* wave-1 value is copied onto all of that individual's rows and then blanked
* on the wave-1 row itself, so only later waves carry the lag.
sort id_indiv wave, stable
foreach v of varlist $outcomes $outcomes2 mwage_ts {
    * Wave-1 value on wave-1 rows, missing everywhere else.
    generate `v'_raw_2021_temp = cond(wave == 1, `v', .)
    * Spread that value across all rows of the same individual.
    by id_indiv: egen `v'_raw_2021 = max(`v'_raw_2021_temp)
    * The wave-1 row has no lag of its own.
    replace `v'_raw_2021 = . if wave == 1
}

* Extensive / Intensive Margin
* For each indicator, flag every row of an id with 1 when the id's wave-1
* total of the indicator is positive and with 0 when that total is zero.
foreach v in aminwage wrcon sosec {
    * Wave-1 total within id (only filled on wave-1 rows).
    bysort id: egen sum_`v' = total(`v') if wave == 1
    * 1 / 0 flag on wave-1 rows; any other total leaves the flag missing.
    generate `v'_some_temp = cond(sum_`v' > 0, 1, cond(sum_`v' == 0, 0, .)) if wave == 1
    * Carry the wave-1 flag to all rows of the id.
    bysort id: egen `v'_some = max(`v'_some_temp)
}

* Loop through samples
* For x = 1, 2, 3: restrict the data with ${sample`x'}, build the weights
* w5 / w6 / w7 / w6_temp, standardise the lagged outcomes, blank out wage
* outliers, create the short-named heterogeneity variables, and save the
* result as 4_individual_ano_reg_`x'.dta. preserve/restore puts the full
* dataset back before the next sample is processed.
* ${sample`x'} is used both after -keep- and as a qualifier of -summarize-,
* so it has to expand to an if/in qualifier.
foreach x of numlist 1/3 {
    preserve
    keep ${sample`x'}

    * Define weights
    * Keep the original code in status_old, then fold status 4 into status 1.
    * From here on status is never 4.
    gen status_old = status
    replace status = 1 if status == 4

    * Number of rows per id-wave and per id-status-wave cell.
    bysort id wave: gen noobs = _N
    bysort id status wave: gen noobs_oldnew = _N

    gen w5 = 1 / noobs

    * Raise no_employees_new to at least the number of status-2 rows in the
    * cell, then cap it at emp_nouveaux. The cap also fires when
    * no_employees_new is missing, because missing compares as larger than
    * any number.
    gen noobs_new = noobs_oldnew if status == 2
    replace no_employees_new = noobs_new if noobs_new > no_employees_new & !missing(noobs_new)
    replace no_employees_new = emp_nouveaux if no_employees_new > emp_nouveaux
    * NOTE(review): emp_old is not used again in this section; w6_temp below
    * reads no_employees_old instead. Confirm which variable is intended.
    gen emp_old = no_employees_alt - no_employees_new

    gen w6_temp = no_employees_old / noobs_oldnew if status == 1
    replace w6_temp = no_employees_new / noobs_oldnew if status == 2

    gen w7 = no_employees_alt / noobs
    * NOTE(review): this resets w7 based on w6_temp. Confirm this is intended
    * rather than flooring w6_temp (or testing w7 < 1).
    replace w7 = 1 if w6_temp < 1

    gen w6 = w6_temp / noobs

    * Scale, trim, and rescale weights
    * Each weight is normalised to mean 1 over its non-missing rows,
    * winsorised at its 5th and 95th percentiles, and normalised again.
    * r() results are used directly (not through macro expansion) so the
    * full double-precision values enter the arithmetic.
    foreach w of varlist w5 w6 w7 w6_temp {
        sum `w'
        * Without this guard an all-missing or zero-sum weight fails further
        * down with an uninformative syntax error.
        if r(N) == 0 | r(sum) == 0 {
            display as error "weight `w' has no usable observations or sums to zero in sample `x'"
            exit 459
        }
        replace `w' = r(N) * `w' / r(sum)
        sum `w', detail
        replace `w' = r(p5) if `w' < r(p5)
        replace `w' = r(p95) if `w' > r(p95) & !missing(`w')
        sum `w'
        replace `w' = r(N) * `w' / r(sum)
    }

    * Create variable versions allowing for missings in lagged dep. var.
    * <var>_2021 is the standardised lag with missings set to 0, and
    * missing_<var>_2021 marks the rows where the raw lag was missing.
    foreach var of varlist $outcomes $outcomes2  mwage_ts {
        egen `var'_2021 = std(`var'_raw_2021)
        replace `var'_2021 = 0 if missing(`var'_raw_2021)
        gen missing_`var'_2021 = missing(`var'_raw_2021)
    }

    * Identify outliers for wage regression
    * A row is flagged when |DFITS| >= 2 * sqrt(k / N), with k = e(df_m) + 1.
    * status 4 was recoded to 1 above, so inlist(status, 1, 2) selects the
    * same rows as the earlier inlist(status, 1, 2, 4).
    * NOTE(review): the regressor is the raw lag, so rows with a missing lag
    * drop out and missing_`o'_2021 is constant in the estimation sample.
    * Confirm whether `o'_2021 (zero-filled) was intended.
    local o lwage
    qui reg `o' assignment `o'_raw_2021 $strata $controls_balance missing_`o'_2021 if inlist(status, 1, 2) & employer == 0
    predict dfits, dfits
    gen outl_lwage = 1 if abs(dfits) >= 2 * sqrt((1 + e(df_m)) / e(N)) & dfits != .
    drop dfits

    *local o lwage_2
    *qui reg `o' assignment `o'_raw_2021 $strata $controls_balance employer missing_`o'_2021 if inlist(status, 1, 2, 4) & employer == 0
    *predict dfits, dfits
    *gen outl_lwage_2 = 1 if abs(dfits) >= 2 * sqrt((1 + e(df_m)) / e(N)) & dfits != .
    *drop dfits

    * Set dep. vars to missing for outliers
    tab outl_lwage
    replace lwage = . if outl_lwage == 1
    replace mwage_ts = . if outl_lwage == 1

    *tab outl_lwage_2
    *replace lwage_2 = . if outl_lwage_2 == 1
    *replace mwage_2_ts = . if outl_lwage_2 == 1

    * mwage_ts shares the lwage outlier flag.
    gen outl_mwage_ts = outl_lwage
    *gen outl_mwage_2_ts = outl_lwage_2

    * Re-standardise the wage lags without the flagged rows; flagged rows
    * end up with a missing <var>_2021.
    foreach var of varlist lwage mwage_ts  {
        drop `var'_2021
        egen `var'_2021 = std(`var'_raw_2021) if outl_`var' != 1
        replace `var'_2021 = 0 if missing(`var'_raw_2021) & outl_`var' != 1
    }

    * Shorten names for heterogeneity regressions
    gen man = male_base
    gen sup = supervisor_base
    gen rel = relationship_base
    gen exp = exper6_base
    gen mar = married_cohabiting_base
    gen abj = abidjan_base
    gen uni = educ_tert_base
    * yth is the 0/1 flip of atleast30_base.
    recode atleast30_base (0 = 1) (1 = 0), gen(yth)
    gen a30 = atleast30_base
    * Rows that originally had status 4 are already counted here because
    * status 4 was recoded to 1 above; the former follow-up
    * "replace old = 1 if status == 4 & wave == 3" could never match.
    gen old = (status == 1)

    * Baseline values for heterogeneity regressions
    * <var>_am_2021 is 1 when the raw lag is above its median (r(p50)),
    * 0 when it is at or below, and missing when the raw lag is missing.
    foreach var of varlist empquality_ind lwage mwage_ts  {
        sum `var'_raw_2021 ${sample`x'}, d
        gen `var'_am_2021 = (`var'_raw_2021 > r(p50)) if !missing(`var'_raw_2021)
    }

    *** Save dataset
    save "$data\4_individual_ano_reg_`x'.dta", replace
    restore
}
